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In [1]: import pandas as pd 


Loan Prediction - Jupyter Notebook 


import matplotlib.pyplot as plt 


import seaborn as sns 


import numpy as np 
sns.set_theme(color_codes=True) 


In [2]: df = pd.read_csv('loan_train.csv')
# NOTE(review): the export printed the call as `pd.read csv('loan train.csv')`;
# underscores were dropped throughout this export, so confirm the file name on disk.
# Preview the first rows to sanity-check the load.
df.head()
out[2]: 
Gender Married Dependents Education Self Employed Applicant Income Coapplicant Income Loan 
0 Male No 0 Graduate No 584900 0.0 
4. Male Yes 4 Graduate No 458300 150800.0 
2 Male Yes 0 Graduate Yes 300000 0.0 
3 Male Yes 0 aue No 258300 235800.0 
4 Male No 0 Graduate No 600000 0.0 


Data Preprocessing Part 1 


In [3]: #Check the number of unique value on object datatype 
# select_dtypes keeps only the object (string) columns; nunique() counts distinct values per column.
df.select_dtypes(include='object').nunique()


Out[3]: Gender 
Married 
Dependents 
Education 
Self_Employed 
Area 
Status 
dtype: int64 
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Exploratory Data Analysis 
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In [4]: # List of categorical variables to plot 
# Categorical columns to compare against the loan Status.
# The original list named 'Dependents' twice, which only drew the same panel
# twice; each column is listed once here.
cat_vars = ['Gender', 'Married', 'Dependents', 'Education',
            'Self_Employed', 'Area', 'Credit_History']

# create figure with subplots
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(15, 10))
axs = axs.flatten()

# create barplot for each categorical variable
for i, var in enumerate(cat_vars):
    sns.countplot(x=var, hue='Status', data=df, ax=axs[i])
    # Rotate the labels already on the axis instead of re-assigning their text.
    axs[i].tick_params(axis='x', rotation=90)

# remove the grid cells that have no variable to show
for unused_ax in axs[len(cat_vars):]:
    unused_ax.remove()

# adjust spacing between subplots
fig.tight_layout()

# show plot
plt.show()


E 


sms 
E EM 
E 
pa t 
i i 
? U 


Noe 


o 


Sel. Enployed 


localhost:8890/notebooks/Loan Prediction.ipynb 


"s 


El 
us 
E 

* 


* 


souten 


8 


D 


Sans 


[UM 


Dependents 


ES 
E | 
E 


"7 palan © 


24 


an 
w 
El 


D 


'Area', 'Credit History', 'Dependents'] 


Graduate 
Not Graduate 


Education 


" 


Dependents 


Sous 


2/26 


4/23/23, 12:38 AM 


In [5]: 


# get list of categorical variables


Loan Prediction - Jupyter Notebook 


import warnings 
warnings. filterwarnings("ignore") 


# Categorical columns to plot. The original list named 'Dependents' twice,
# which only repeated one panel; each column is listed once here.
cat_vars = ['Gender', 'Married', 'Dependents', 'Education',
            'Self_Employed', 'Area', 'Credit_History']

# create figure with subplots
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(15, 10))
axs = axs.flatten()

# create histplot for each categorical variable
for i, var in enumerate(cat_vars):
    # multiple="fill" stacks the Status groups so every bar sums to 1.
    # NOTE(review): the original call was cut off after `kde=False,` in this
    # export; any further keyword arguments it had are unknown.
    sns.histplot(x=var, hue='Status', data=df, ax=axs[i], multiple="fill", kde=False)
    # Rotate the labels the plot already has. The original overwrote them with
    # df[var].unique(), whose order of appearance (and any NaN entry) need not
    # line up with the tick positions, so bars could end up mislabelled.
    axs[i].tick_params(axis='x', rotation=90)
    axs[i].set_xlabel(var)

# remove the grid cells that have no variable to show
for unused_ax in axs[len(cat_vars):]:
    unused_ax.remove()

# adjust spacing between subplots
fig.tight_layout()

# show plot
plt.show()
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In [6]: num vars = ['Applicant Income', 'Coapplicant Income', 'Term' 


fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 7)) 
axs = axs.flatten() 


for i, var in enumerate(num vans): 
sns.boxplot(x=var, data=df, ax=axs[i]) 


fig.tight_layout() 


plt.show() 
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In [7]: 


Loan Prediction - Jupyter Notebook 
num_vars = ['Applicant_Income', 'Coapplicant_Income', 'Term']

# One violin plot per numeric column to show the shape of each distribution.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 7))
axs = axs.flatten()

for i, var in enumerate(num_vars):
    sns.violinplot(x=var, data=df, ax=axs[i])

fig.tight_layout()
plt.show()
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In [8]: 


Loan Prediction - Jupyter Notebook 
num_vars = ['Applicant_Income', 'Coapplicant_Income', 'Term']

# Same violins as before, split by loan Status on the y axis.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 7))
axs = axs.flatten()

for i, var in enumerate(num_vars):
    sns.violinplot(x=var, y='Status', data=df, ax=axs[i])

fig.tight_layout()

plt.show()
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In [9]: num vars = ['Applicant Income', 'Coapplicant Income', ‘Term’ 


fig, axs - plt.subplots(nrows-1, ncols-3, figsize-(15, 7)) 
axs - axs.flatten() 


for i, var in enumerate(num vars): 
sns.histplot(x-var, data-df, ax-axs[i]) 


fig.tight layout() 


plt.show() 
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In [10]: 


In [11]: 


out[11]: 


# NOTE(review): the export shuffled this cell's statements across a page break;
# the order below is rebuilt from the fragments and mirrors the sibling plot cells.
num_vars = ['Applicant_Income', 'Coapplicant_Income', 'Term']

fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 7))
axs = axs.flatten()

# Histogram per numeric column, coloured by loan Status.
for i, var in enumerate(num_vars):
    sns.histplot(x=var, data=df, hue='Status', ax=axs[i])

fig.tight_layout()
plt.show()
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Data Preprocessing Part 2 


zo 


p 


df.head() 
Gender Married Dependents Education Self Employed Applicant Income Coapplicant Income Loan 
0 Male No 0 Graduate No 584900 0.0 
4 Mae — Yes 1 Graduate No 458300 150800.0 
2 Male — Yes O Graduate Yes 300000 0.0 
3 Male Yes 0' cali No 258300 235800.0 
4 Male No 0 Graduate No 600000 0.0 
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In [12]: 


out[12]: 


In [13]: 


out[13]: 


In [14]: 


Out[14]: 


In [15]: 


Loan Prediction - Jupyter Notebook 


#Check the missing value
# Share of missing entries per column, in percent: null count * 100 / number of rows.
check_missing = df.isnull().sum() * 100 / df.shape[0]
# Show only the columns that actually have gaps, worst first.
check_missing[check_missing > 0].sort_values(ascending=False)


Credit History ^ 8.143322 
Self Employed 5.211726 
Dependents 2.442997 
Term 2.280130 
Gender 2.117264 
Married 0.488599 


dtype: float64 


# Fill null values with 'Unknown'
# Rebinding instead of inplace=True keeps the cell chainable and avoids mutating
# a frame that earlier cells have already displayed.
# NOTE(review): this also writes the string 'Unknown' into the numeric Term and
# Credit_History columns, which is why df.dtypes reports them as object afterwards.
df = df.fillna('Unknown')

#Check the missing value again
check_missing = df.isnull().sum() * 100 / df.shape[0]
check_missing[check_missing > 0].sort_values(ascending=False)


Series([], dtype: float64) 


df .dtypes 
Gender object 
Married object 
Dependents object 
Education object 
Self Employed object 
Applicant Income int64 
Coapplicant Income — float64 
Loan Amount int64 
Term object 
Credit History object 
Area object 
Status object 


dtype: object 


Label Encoding for Object datatype 


# Loop over each column in the DataFrame where dtype is 'object'
for col in df.select_dtypes(include=['object']).columns:
    # Print the column name and the unique values
    print(f"{col}: {df[col].unique()}")


Gender: ['Male' 'Female' 'Unknown'] 

Married: ['No' 'Yes' 'Unknown'] 

Dependents: ['@' '1' '2' '3+' 'Unknown'] 

Education: ['Graduate' 'Not Graduate'] 

Self Employed: ['No' 'Yes' 'Unknown'] 

Term: [360.0 120.0 240.0 'Unknown' 180.0 60.0 300.0 480.0 36.0 84.0 12.0] 
Credit History: [1.0 0.0 'Unknown' 

Area: ['Urban' 'Rural' 'Semiurban'] 

Status: ['Y' 'N'] 
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In [16]: 


In [17]: 


Loan Prediction - Jupyter Notebook 


# Convert selected columns to string data type
# After the fill these two columns mix floats with the string 'Unknown';
# casting to str makes every value the same type before label encoding.
# NOTE(review): the right-hand side was cut off in this export; `.astype(str)`
# is reconstructed from the comment above.
df[['Term', 'Credit_History']] = df[['Term', 'Credit_History']].astype(str)


from sklearn import preprocessing 


# Loop over each column in the DataFrame where dtype is 'object'
for col in df.select_dtypes(include=['object']).columns:
    # A fresh encoder per column keeps each column's code table independent.
    label_encoder = preprocessing.LabelEncoder()

    # fit_transform learns the class list and maps the column in one step;
    # it yields the same codes as fitting on the unique values and then transforming.
    # NOTE(review): 'Term' is encoded from strings here, so its codes follow
    # string sort order rather than the numeric length of the term.
    df[col] = label_encoder.fit_transform(df[col])

    # Print the column name and the unique encoded values
    print(f"{col}: {df[col].unique()}")


Gender: [1 8 2] 

Married: [@ 2 1] 

Dependents: [0 1 2 3 4] 

Education: (8 1] 

Self Employed: [0 2 1] 

Term: [6 1 319 2 8 4 7 5 9 8) 
Credit History: [1 @ 2] 

Area: [2 @ 1] 

Status: [1 0] 


Check if the Label 'Status' is balanced or not 
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In [18]: sns.countplot(df['Status']) 
df['Status'].value counts() 


Out[18]: 1 422 
e 192 
Name: Status, dtype: int64 


400 


350 


Status 


Oversampling Minority Class to balance the 
Label 


In [19]: from sklearn.utils import resample 

#create two different dataframe of majority and minority class
df_majority = df[(df['Status']==1)]
df_minority = df[(df['Status']==0)]

# upsample minority class
# NOTE(review): this runs before the train/test split, so copies of the same
# minority row can land in both train and test; confirm that is intended,
# because it makes the test scores optimistic.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                  # sample with replacement
                                 n_samples=len(df_majority),    # match majority class (422 rows in the original run)
                                 random_state=0)                # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_minority_upsampled, df_majority])
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In [20]: sns.countplot(df upsampled['Status']) 
df upsampled['Status'].value counts() 


Out[20]: @ 422 
1 422 
Name: Status, dtype: int64 


400 


Status 


Remove Outliers using IQR because there are a lot of
extreme values


In [21]: df_upsampled.shape 


Out[21]: (844, 12) 
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In [22]: # specify the columns to remove outliers from dataframe 


# NOTE(review): the export shuffled and truncated this cell; the loop below is
# rebuilt from its fragments (quantile(0.25), quantile(0.75), `Q3 - Q1`, and the
# `< (Q1 - 1.5 * IQR)` mask). The upper-fence half of the mask was cut off and
# is assumed to be the usual `> (Q3 + 1.5 * IQR)`.
column_names = ['Applicant_Income', 'Coapplicant_Income', 'Term']

# remove outliers for each selected column using the IQR method
# NOTE(review): 'Term' holds label-encoded codes at this point, so its fences
# are computed on codes, not on term lengths. Confirm that is wanted.
for column_name in column_names:
    Q1 = df_upsampled[column_name].quantile(0.25)
    Q3 = df_upsampled[column_name].quantile(0.75)
    IQR = Q3 - Q1
    # Keep the rows that are neither below the lower fence nor above the upper fence.
    df_upsampled = df_upsampled[~((df_upsampled[column_name] < (Q1 - 1.5 * IQR)) | (df_upsampled[column_name] > (Q3 + 1.5 * IQR)))]

df_upsampled.head()


ME Gender Married Dependents Education Self Employed Applicant Income Coapplicant Income Lo. 
148 0 0 0 0 0 1000000 166600.0 
338 0 0 3 1 0 183000 0.0 
24 1 2 1 0 pi 371700 292500.0 
57 1 2 0 0 0 336600 220000.0 
107 1 0 0 1 1 733300 0.0 
» 
In [23]: #Check the shape after outlier removal 
df_upsampled. shape 
Out[23]: (614, 12) 
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In [24]: plt.figure(figsize=(15,12)) 
sns.heatmap(df upsampled.corr(), fmt-'.2g', annot-True) 


Out[24]: «AxesSubplot:» 
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In [25]: df upsampled.drop(columns='Term', inplace=True) 


Train Test Split


In [26]: X = df_upsampled.drop('Status', axis=1)
# y is the encoded Status label; X is every remaining column.
y = df_upsampled['Status']


In [27]: #test size 20% and train size 80% 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 
# NOTE(review): the original line was cut off after `random_state=0` in this
# export; the call is closed there and any further arguments are unknown.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


Decision Tree 
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In [28]: from sklearn.tree import DecisionTreeClassifier 
from sklearn.model selection import GridSearchCV 
# Seed the estimator so the grid search picks the same hyperparameters on every run
# (the final model is built with random_state=0 as well).
dtree = DecisionTreeClassifier(random_state=0)
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3, 4]
}

# Perform a grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(dtree, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)


("max depth': 8, 'min samples leaf': 1, 'min samples split': 2) 

In [29]: from sklearn.tree import DecisionTreeClassifier 
# Hyperparameters copied from the grid-search printout above.
# NOTE(review): the call was cut off after `min s` in this export; min_samples_split=2
# is taken from that printout.
dtree = DecisionTreeClassifier(random_state=0, max_depth=8, min_samples_leaf=1, min_samples_split=2)
dtree.fit(X_train, y_train)

Out[29]: DecisionTreeClassifier(max depth-8, random state-0) 


In [30]: y_pred = dtree.predict(X_test)
# Accuracy on the held-out split, as a percentage rounded to two decimals.
print("Accuracy Score :", round(accuracy_score(y_test, y_pred)*100 ,2), "%")


Accuracy Score : 86.18 X 


In [31]: from sklearn.metrics import accuracy score, f1 score, precision score, recall score, | 
print('F-1 Score : ',(f1 score(y test, y pred, average-'micro'))) 
print('Precision Score : ',(precision score(y test, y pred, average-'micro'))) 
print('Recall Score : ',(recall score(y test, y pred, average='micro'))) 
print('Jaccard Score : ',(jaccard score(y test, y pred, average-'micro'))) 
print('Log Loss : ',(log loss(y test, y pred))) 


F-1 Score : @.861788617886179 
Precision Score : @.8617886178861789 
Recall Score : @.8617886178861789 
Jaccard Score 8.7571428571428571 
Log Loss : 4.773697527605633 
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In [32]: 


Loan Prediction - Jupyter Notebook 


# Pair every training column with the importance the fitted tree assigned to it.
imp_df = pd.DataFrame({
    "Feature Name": X_train.columns,
    "Importance": dtree.feature_importances_
})

# Most important first; the export printed this name as `i`, but the next line reads `fi`.
fi = imp_df.sort_values(by="Importance", ascending=False)

fi2 = fi.head(10)
plt.figure(figsize=(10,8))
sns.barplot(data=fi2, x='Importance', y='Feature Name')
plt.title('Top 10 Feature Importance Each Attributes (Decision Tree)', fontsize=18)
plt.xlabel('Importance', fontsize=16)
plt.ylabel('Feature Name', fontsize=16)
plt.show()


Top 10 Feature Importance Each Attributes (Decision Tree) 
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In [33]: import shap 
# compute SHAP values
# TreeExplainer is built from the fitted decision tree and evaluated on the test features.
explainer = shap.TreeExplainer(dtree)
shap_values = explainer.shap_values(X_test)
# NOTE(review): indexing [1] assumes shap_values is a per-class sequence and selects
# the entry for class 1; confirm against the installed shap version.
shap.summary_plot(shap_values[1], X_test.values, feature_names = X_test.columns)
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In [35]: import shap 
# Rebuild the explainer for the fitted decision tree.
explainer = shap.TreeExplainer(dtree)


# SHAP values for the test features, passed to the summary plot without selecting a class.
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)
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In [37]: from sklearn.metrics import confusion_matrix 
# Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,5))
# fmt='d' prints the cell counts as plain integers instead of the default '.2g' format.
sns.heatmap(data=cm, linewidths=.5, annot=True, fmt='d', cmap = 'Blues')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

# NOTE(review): the format call was cut off in this export; `y_test))` is
# reconstructed from the printed title, which matches the accuracy score.
all_sample_title = 'Accuracy Score for Decision Tree: {0}'.format(dtree.score(X_test, y_test))
plt.title(all_sample_title, size = 15)


Out[37]: Text(0.5, 1.0, ‘Accuracy Score for Decision Tree: 0.8617886178861789') 


Accuracy Score for Decision Tree: 0.8617886178861789 
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In [38]: 


out[38]: 


Loan Prediction - Jupyter Notebook 


from sklearn.metrics import roc_curve, roc_auc_score 
# Probability the tree assigns to the positive class (column 1 of predict_proba).
y_pred_proba = dtree.predict_proba(X_test)[:, 1]

# Actual labels and predicted probabilities side by side, indexed like y_test.
# NOTE(review): the second half of the original concat line was cut off in this
# export; the 'y_pred_proba' column name is taken from the lookups below.
df_actual_predicted = pd.DataFrame(
    {'y_actual': np.array(y_test), 'y_pred_proba': y_pred_proba},
    index=y_test.index,
)

fpr, tpr, tr = roc_curve(df_actual_predicted['y_actual'], df_actual_predicted['y_pred_proba'])
auc = roc_auc_score(df_actual_predicted['y_actual'], df_actual_predicted['y_pred_proba'])

plt.plot(fpr, tpr, label='AUC = %0.4f' % auc)
# Diagonal reference line: the ROC of a random classifier.
plt.plot(fpr, fpr, linestyle = '--', color='k')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve', size = 15)

plt.legend()


«matplotlib.legend.Legend at 0x20f4afc3310» 


ROC Curve 
10  —— AUC - 0.8980 


0.8 


06 


0.4 


True Positive Rate 


02 


0.0 


0.0 0.2 0.4 0.6 0.8 1.0 
False Positive Rate 


Random Forest 
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In [39]: from sklearn.ensemble import RandomForestClassifier 
from sklearn.model selection import GridSearchCV 
# Seed the forest so the grid search picks the same hyperparameters on every run
# (the final model is built with random_state=0 as well).
rfc = RandomForestClassifier(random_state=0)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10],
    'max_features': ['sqrt', 'log2', None]
}

# Perform a grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(rfc, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)


('max depth': None, 'max features': 'log2', 'n_estimators': 200) 


In [40]: from sklearn.ensemble import RandomForestClassifier 
# Hyperparameters copied from the grid-search printout above (max_depth=None is the default).
rfc = RandomForestClassifier(random_state=0, max_features='log2', n_estimators=200)
rfc.fit(X_train, y_train)


Out[40]: RandomForestClassifier(max features-'log2', n estimators-200, random state-0) 


In [41]: y_pred = rfc.predict(X_test)
# Accuracy on the held-out split, as a percentage rounded to two decimals.
print("Accuracy Score :", round(accuracy_score(y_test, y_pred)*100 ,2), "%")


Accuracy Score : 95.12 X 


In [42]: from sklearn.metrics import accuracy score, f1 score, precision score, recall score, | 
print('F-1 Score : ',(f1 score(y test, y pred, average-'micro'))) 
print('Precision Score : ',(precision score(y test, y pred, average-'micro'))) 
print('Recall Score : ',(recall score(y test, y pred, average='micro'))) 
print('Jaccard Score : ',(jaccard score(y test, y pred, average-'micro'))) 
print('Log Loss : ',(log loss(y test, y pred))) 


F-1 Score : 8.9512195121951219 
Precision Score : @.9512195121951219 
Recall Score : @.9512195121951219 
Jaccard Score : 0.9069767441860465 
Log Loss : 1.6848443638958128 
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In [43]: 


Loan Prediction - Jupyter Notebook 


# Pair every training column with the importance the fitted forest assigned to it.
imp_df = pd.DataFrame({
    "Feature Name": X_train.columns,
    "Importance": rfc.feature_importances_
})

# Most important first; the export printed this name as `i`, but the next line reads `fi`.
fi = imp_df.sort_values(by="Importance", ascending=False)

fi2 = fi.head(10)
plt.figure(figsize=(10,8))
sns.barplot(data=fi2, x='Importance', y='Feature Name')
plt.title('Top 10 Feature Importance Each Attributes (Random Forest)', fontsize=18)
plt.xlabel('Importance', fontsize=16)
plt.ylabel('Feature Name', fontsize=16)
plt.show()


Top 10 Feature Importance Each Attributes (Random Forest) 
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In [44]: import shap 
# compute SHAP values
# TreeExplainer is built from the fitted random forest and evaluated on the test features.
explainer = shap.TreeExplainer(rfc)
shap_values = explainer.shap_values(X_test)
# NOTE(review): indexing [1] assumes shap_values is a per-class sequence and selects
# the entry for class 1; confirm against the installed shap version.
shap.summary_plot(shap_values[1], X_test.values, feature_names = X_test.columns)
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In [45]: import shap 
# Rebuild the explainer for the fitted random forest.
explainer = shap.TreeExplainer(rfc)


# SHAP values for the test features, passed to the summary plot without selecting a class.
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)
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In [46]: from sklearn.metrics import confusion_matrix 
# Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,5))
# fmt='d' prints the cell counts as plain integers instead of the default '.2g' format.
sns.heatmap(data=cm, linewidths=.5, annot=True, fmt='d', cmap = 'Blues')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# NOTE(review): the format call was cut off in this export; `y_test))` is
# reconstructed from the printed title, which matches the accuracy score.
all_sample_title = 'Accuracy Score for Random Forest: {0}'.format(rfc.score(X_test, y_test))

plt.title(all_sample_title, size = 15)
Out[46]: Text(0.5, 1.0, 'Accuracy Score for Random Forest: 0.9512195121951219') 


Accuracy Score for Random Forest: 0.9512195121951219 
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In [47]: 


out[a7]: 
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from sklearn.metrics import roc_curve, roc_auc_score 
# Probability the forest assigns to the positive class (column 1 of predict_proba).
y_pred_proba = rfc.predict_proba(X_test)[:, 1]

# Actual labels and predicted probabilities side by side, indexed like y_test.
# NOTE(review): the second half of the original concat line was cut off in this
# export; the 'y_pred_proba' column name is taken from the lookups below.
df_actual_predicted = pd.DataFrame(
    {'y_actual': np.array(y_test), 'y_pred_proba': y_pred_proba},
    index=y_test.index,
)

fpr, tpr, tr = roc_curve(df_actual_predicted['y_actual'], df_actual_predicted['y_pred_proba'])
auc = roc_auc_score(df_actual_predicted['y_actual'], df_actual_predicted['y_pred_proba'])

plt.plot(fpr, tpr, label='AUC = %0.4f' % auc)
# Diagonal reference line: the ROC of a random classifier.
plt.plot(fpr, fpr, linestyle = '--', color='k')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve', size = 15)

plt.legend()
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